suppressPackageStartupMessages({
library(here) # Path management
library(readr) # Better file reading
library(Rsubread) # Alignment feature counting
library(Tnseq) # Statistical testing
library(stringr) # Matching sample table to files
library(stringdist) # Matching sample table to files
library(pander) # Render non-interactive tables
library(dplyr) # Data wrangling
library(tidyr) # Data wrangling
library(ggplot2) # Plotting
library(plotly) # Plotting
library(ggrepel) # Plotting
library(writexl) # Writing xlsx documents
library(pheatmap) # Heatmaps
library(reactable) # Interactive tables
library(pheatmap)
library(Gviz)
library(corrplot)
})
my_theme <-theme_bw() +
theme(text = element_text(size = 18))
We read the available data. Here’s what’s what:
ta_sites <- here("genome","genes_tasites.saf")
ta_df <- read_tsv(ta_sites,
col_names = c("GeneID", "Chr", "Start", "End", "Strand"))
## Parsed with column specification:
## cols(
## GeneID = col_character(),
## Chr = col_character(),
## Start = col_double(),
## End = col_double(),
## Strand = col_character()
## )
sample_info <- read_csv(here("datafiles","sampledata.csv"), col_types = 'ccffffc')
bamfiles <- dir(here("analysis","03-align"),
pattern = "*\\.bam$",
full.names = TRUE)
if (file.exists(here("datafiles", "genes_of_interest.txt"))) {
gois <- read_lines(here("datafiles", "genes_of_interest.txt"))
} else {
gois <- character()
}
Check for order disparity between mapping files and the sample information data and reorder if needed. Manually verify that the order is correct
closest <- function(a, b) {
which.min(stringdist(a, b))
}
ind <- sapply(bamfiles, closest, sample_info$Sample)
sample_info <- sample_info[ind, ]
pander(data.frame(A = sample_info$Sample, B = basename(bamfiles)))
| A | B |
|---|---|
| C58wt-1-1.fastq.gz | C58wt-1-1_trimmed_dec.fq.gz.sam.bam |
| C58wt-1-2.fastq.gz | C58wt-1-2_trimmed_dec.fq.gz.sam.bam |
| C58wt-2-1.fastq.gz | C58wt-2-1_trimmed_dec.fq.gz.sam.bam |
| C58wt-2-2.fastq.gz | C58wt-2-2_trimmed_dec.fq.gz.sam.bam |
| C58wt-3-1.fastq.gz | C58wt-3-1_trimmed_dec.fq.gz.sam.bam |
| C58wt-3-2.fastq.gz | C58wt-3-2_trimmed_dec.fq.gz.sam.bam |
| D13-1-1.fastq.gz | D13-1-1_trimmed_dec.fq.gz.sam.bam |
| D13-1-2.fastq.gz | D13-1-2_trimmed_dec.fq.gz.sam.bam |
| D13-2-1.fastq.gz | D13-2-1_trimmed_dec.fq.gz.sam.bam |
| D13-2-2.fastq.gz | D13-2-2_trimmed_dec.fq.gz.sam.bam |
| D13-3-1.fastq.gz | D13-3-1_trimmed_dec.fq.gz.sam.bam |
| D13-3-2.fastq.gz | D13-3-2_trimmed_dec.fq.gz.sam.bam |
suppressWarnings({
dir.create(here("analysis/","04-statistical-analysis"), recursive = TRUE)
dir.create(here("analysis/","04-statistical-analysis", "plots"), recursive = TRUE)
dir.create(here("analysis/","04-statistical-analysis", "tables"), recursive = TRUE)
})
Here we count how many reads overlap each feature (i.e. a TA site). The default is to allow a read to be counted for multiple TA sites. If you set allowMultiOverlap = FALSE then only reads which are overlapping only a single TA site would be counted. Change this only if you know what you are doing. In the following example Read 3 would never be counted since it overlaps two TA sites and Read 2 could not be counted since it overlaps a single TA site in two separate genes (on either strand).
Genome -----TATA------
Gene5'3' <<<<<<<<<<<<<<<
Gene3'5' >>>>>>>
Read 1 =====
Read 2 ====
Read 3 =====
Read 4 =====
sr_counts <- featureCounts(bamfiles,
annot.ext = ta_df,
useMetaFeatures = FALSE,
allowMultiOverlap = TRUE)
##
## ========== _____ _ _ ____ _____ ______ _____
## ===== / ____| | | | _ \| __ \| ____| /\ | __ \
## ===== | (___ | | | | |_) | |__) | |__ / \ | | | |
## ==== \___ \| | | | _ <| _ /| __| / /\ \ | | | |
## ==== ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
## ========== |_____/ \____/|____/|_| \_\______/_/ \_\_____/
## Rsubread 2.2.6
##
## //========================== featureCounts setting ===========================\\
## || ||
## || Input files : 12 BAM files ||
## || o C58wt-1-1_trimmed_dec.fq.gz.sam.bam ||
## || o C58wt-1-2_trimmed_dec.fq.gz.sam.bam ||
## || o C58wt-2-1_trimmed_dec.fq.gz.sam.bam ||
## || o C58wt-2-2_trimmed_dec.fq.gz.sam.bam ||
## || o C58wt-3-1_trimmed_dec.fq.gz.sam.bam ||
## || o C58wt-3-2_trimmed_dec.fq.gz.sam.bam ||
## || o D13-1-1_trimmed_dec.fq.gz.sam.bam ||
## || o D13-1-2_trimmed_dec.fq.gz.sam.bam ||
## || o D13-2-1_trimmed_dec.fq.gz.sam.bam ||
## || o D13-2-2_trimmed_dec.fq.gz.sam.bam ||
## || o D13-3-1_trimmed_dec.fq.gz.sam.bam ||
## || o D13-3-2_trimmed_dec.fq.gz.sam.bam ||
## || ||
## || Annotation : R data.frame ||
## || Dir for temp files : . ||
## || Threads : 1 ||
## || Level : feature level ||
## || Paired-end : no ||
## || Multimapping reads : counted ||
## || Multi-overlapping reads : counted ||
## || Min overlapping bases : 1 ||
## || ||
## \\============================================================================//
##
## //================================= Running ==================================\\
## || ||
## || Load annotation file .Rsubread_UserProvidedAnnotation_pid242392 ... ||
## || Features : 90971 ||
## || Meta-features : 5223 ||
## || Chromosomes/contigs : 4 ||
## || ||
## || Process BAM file C58wt-1-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1299548 ||
## || Successfully assigned alignments : 699378 (53.8%) ||
## || Running time : 0.02 minutes ||
## || ||
## || Process BAM file C58wt-1-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1546203 ||
## || Successfully assigned alignments : 831070 (53.7%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file C58wt-2-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1518710 ||
## || Successfully assigned alignments : 825079 (54.3%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file C58wt-2-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1828389 ||
## || Successfully assigned alignments : 992267 (54.3%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file C58wt-3-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1272892 ||
## || Successfully assigned alignments : 702456 (55.2%) ||
## || Running time : 0.02 minutes ||
## || ||
## || Process BAM file C58wt-3-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1377246 ||
## || Successfully assigned alignments : 758786 (55.1%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file D13-1-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1700999 ||
## || Successfully assigned alignments : 958862 (56.4%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file D13-1-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 2074260 ||
## || Successfully assigned alignments : 1169853 (56.4%) ||
## || Running time : 0.04 minutes ||
## || ||
## || Process BAM file D13-2-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1647931 ||
## || Successfully assigned alignments : 859458 (52.2%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file D13-2-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1989092 ||
## || Successfully assigned alignments : 1034167 (52.0%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Process BAM file D13-3-1_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1412599 ||
## || Successfully assigned alignments : 783787 (55.5%) ||
## || Running time : 0.02 minutes ||
## || ||
## || Process BAM file D13-3-2_trimmed_dec.fq.gz.sam.bam... ||
## || Single-end reads are included. ||
## || Total alignments : 1579044 ||
## || Successfully assigned alignments : 873030 (55.3%) ||
## || Running time : 0.03 minutes ||
## || ||
## || Write the final count table. ||
## || Write the read assignment summary. ||
## || ||
## \\============================================================================//
sr_counts$countsNames <- dimnames(sr_counts$counts)
sr_counts$counts <- unname(sr_counts$counts)
We sum up over technical replicates even if there are none (or only one)
sr_counts$tr_counts <- t(apply(sr_counts$counts, 1, tapply,
sample_info$TechnicalReplicate, sum))
sample_info_tr <- do.call(rbind,
lapply(split(sample_info, sample_info$TechnicalReplicate), function(f) {
x <- head(f, 1)
x$Name <- paste(x$Condition, x$BiologicalReplicate, x$Pool, sep="-")
x
})
)
sr_counts$tr_counts <- sr_counts$tr_counts[, sample_info_tr$TechnicalReplicate]
x <- as_tibble(sr_counts$counts)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
colnames(x) <- sample_info$Name
x$Gene <- sr_counts$annotation$GeneID
x %>%
pivot_longer(-Gene) %>%
left_join(sample_info, by = c("name" = "Name")) -> x
x %>%
group_by(Gene, Condition) %>%
summarise(PercentDisrupted = sum(value > 0) / length(value) * 100) %>%
ggplot(aes(x = PercentDisrupted, fill = Condition)) +
geom_histogram(bins = 50) +
facet_wrap(~Condition) +
my_theme -> pl
## `summarise()` regrouping output by 'Gene' (override with `.groups` argument)
ggsave(here("analysis", "04-statistical-analysis/", "plots", "percent_disruption.pdf"),
width = 12, height = 10, plot = pl)
ggplotly(pl)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
See https://dx.doi.org/10.1186%2Fs12859-017-1745-2 for details.
tnseq <- TnseqDiff(countData = sr_counts$tr_counts,
geneID = ta_df$GeneID,
location = ta_df$Start,
pool = sample_info_tr$Pool,
condition = sample_info_tr$Condition)
## [1] "Finish Estimation for Pool 1"
Adjust P values for multiple testing. We use the Benjamini Hochberg procedure. See https://en.wikipedia.org/wiki/False_discovery_rate for details.
tnseq$resTable$padj <- p.adjust(tnseq$resTable$pvalue, method = 'BH')
Order by most significant genes. NOTE that the pvalue can never be actually 0, but sometimes it is lower than the rounding error of the computer and gets represented as a 0. In your results, these should be described as \(p<<10^{-150}\) to avoid confusion.
tnseq$resTable <- tnseq$resTable[order(tnseq$resTable$padj),]
Write results tables as XLSX and CSV
write_xlsx(tnseq$resTable,
here("analysis", "04-statistical-analysis/", "tables",
"results.xlsx"))
write_csv(tnseq$resTable,
here("analysis", "04-statistical-analysis/", "tables",
"results.csv"))
write_xlsx(tnseq$est.insertion,
here("analysis", "04-statistical-analysis/", "tables",
"est_insertion.xlsx"))
write_csv(tnseq$est.insertion,
here("analysis", "04-statistical-analysis/", "tables",
"est_insertion.csv"))
reactable::reactable(tnseq$resTable, filterable = TRUE, compact = TRUE,
striped = TRUE)
A principal component analysis of samples based on read counts from TA insertions using limma::voom for data transformation. The transformation is not optimal for this kind of data and should be taken with a grain of salt.
gene_sums <- apply(sr_counts$tr_counts, 2, tapply, sr_counts$annotation$GeneID, sum)
voom_data <- limma::voom(gene_sums)$E
pca <- prcomp(t(voom_data))
tibble(PC1 = pca$x[, 1], PC2 = pca$x[, 2]) %>%
cbind(sample_info_tr) %>%
ggplot(aes(x = PC1, y = PC2, col = Condition,
pch = BiologicalReplicate)) +
geom_point(size = 4) +
my_theme -> pl
ggsave(here("analysis", "04-statistical-analysis/", "plots", "sample_pca.pdf"),
width = 12, height = 10, plot = pl)
ggplotly(pl)
gene_sums <- apply(sr_counts$counts, 2, tapply, sr_counts$annotation$GeneID, sum)
voom_data <- limma::voom(gene_sums)$E
colnames(voom_data) <- sample_info$Name
cors <- cor(voom_data)
corrplot(cors, type = "lower", is.corr = FALSE)
Top 20 genes are labeled in the print version of this plot
fc_name <- sym(colnames(tnseq$resTable)[7])
as_tibble(tnseq$resTable) %>%
ggplot(aes(x = !!fc_name,
y = -log10(padj + .Machine$double.eps),
label = ID)) +
geom_label_repel(data = head(tnseq$resTable, 20), direction = 'both') +
geom_point() +
my_theme -> pl
ggsave(here("analysis", "04-statistical-analysis/", "plots", "volcano_plot.pdf"),
width = 12, height = 10, plot = pl)
suppressWarnings(ggplotly(pl))
if (length(gois) > 0) {
gois_idx <- sapply(str_to_lower(tnseq$resTable$ID), str_detect, str_to_lower(gois))
if (any(colSums(gois_idx) > 1)) {
warning("Ambiguous gene names in gene of interest! Please provide unique names")
}
gois_idx <- apply(gois_idx, 1, which)
as_tibble(tnseq$resTable) -> j
j$GeneOfInterest <- FALSE
j$GeneOfInterest[gois_idx] <- TRUE
j %>%
ggplot(aes(x = !!fc_name,
y = -log10(padj + .Machine$double.eps),
label = ID,
col = GeneOfInterest)) +
geom_text_repel(data = j[gois_idx, ], direction = 'both') +
geom_point() +
my_theme -> pl
ggsave(here("analysis", "04-statistical-analysis/", "plots", "volcano_gois_plot.pdf"),
width = 12, height = 10, plot = pl)
suppressWarnings(ggplotly(pl))
}
as_tibble(tnseq$resTable) %>%
left_join(ta_df, by = c("ID" = "GeneID")) %>%
ggplot(aes(x = !!fc_name,
y = -log10(padj + .Machine$double.eps),
label = ID,
col = Chr)) +
geom_point() +
my_theme -> pl
ggsave(here("analysis", "04-statistical-analysis/", "plots", "volcano_chr_plot.pdf"),
width = 12, height = 10, plot = pl)
suppressWarnings(ggplotly(pl))